The dataset is obtained from the 2016 NYC Yellow Cab trip record data made available in BigQuery on Google Cloud Platform. The data was originally published by the NYC Taxi and Limousine Commission (TLC). This report will demonstrate taxi trip time prediction from the following predictors: pickup location, drop-off location, and pickup datetime.
We have done some exploratory data analysis in the previous report New York Taxi Popular Pickup Location Map.
We will continue to massage the data in a different way in order to predict the trip time of taxi ride.
# The URL to download the train.zip is from the kaggle website:
# website:https://www.kaggle.com/c/nyc-taxi-trip-duration/data/
# Load the train / test sets through a two-level cache: a serialized
# .bin (fast reload) falling back to the raw .csv, which is extracted
# from train.zip only when actually needed.  The original code called
# unzip() unconditionally, which warns when train.zip is absent and
# wastes work when the cached .bin already exists.
if (!file.exists("train.bin") && !file.exists("train.csv") && file.exists("train.zip")) {
  unzip("train.zip")
}
trainDS <- c()
if (file.exists("train.bin")) {
  trainDS <- readRDS("train.bin")
} else if (file.exists("train.csv")) {
  # read.csv is read.table pre-configured for comma-separated files
  trainDS <- read.csv("train.csv", header = TRUE)
  saveRDS(trainDS, file = "train.bin")
}
if (file.exists("test.bin")) {
  testDS <- readRDS("test.bin")
} else if (file.exists("test.csv")) {
  testDS <- read.csv("test.csv", header = TRUE)
  saveRDS(testDS, file = "test.bin")
}
# Column names of the raw training data.
names(trainDS)
## [1] "id" "vendor_id" "pickup_datetime"
## [4] "dropoff_datetime" "passenger_count" "pickup_longitude"
## [7] "pickup_latitude" "dropoff_longitude" "dropoff_latitude"
## [10] "store_and_fwd_flag" "trip_duration"
# The test data lacks dropoff_datetime and trip_duration.
names(testDS)
## [1] "id" "vendor_id" "pickup_datetime"
## [4] "passenger_count" "pickup_longitude" "pickup_latitude"
## [7] "dropoff_longitude" "dropoff_latitude" "store_and_fwd_flag"
# Pad the test set with NA columns so it can be row-bound to the train set.
testDS$trip_duration <- NA
testDS$dropoff_datetime <- NA
# Keep only train rows with a recorded dropoff and trips shorter than
# 24 hours.  (Note: is.na(x) == FALSE is equivalent to !is.na(x).)
trainDS <- trainDS %>%
filter(is.na(dropoff_datetime)==FALSE) %>%
filter(trip_duration < 24*60*60)
# Stack train and test so feature engineering is applied uniformly;
# test rows remain identifiable later by their NA dropoff_datetime.
allDS <- rbind(trainDS, testDS)
rm(testDS)
rm(trainDS)
# Reclaim memory from the removed data frames.
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 7601733 406.0 12002346 641.0 8538995 456.1
## Vcells 43749022 333.8 92815084 708.2 92792835 708.0
We will select the following columns: pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude, pickup_datetime, dropoff_datetime as the input variables, and trip_duration will be our outcome variable.
In order to translate the geospatial data more accurately, we will be adding some new columns: 1. Trip distance: we calculate the trip distance between the pickup and drop-off locations using the geosphere library (distGeo function), and we will update the dataset with an additional column named “trip_distance” storing the distance in meters. 2. Trip bearing / haversine: these capture the great-circle distance and direction between the two geo points.
3. Pick up hour : We will derive the pickup hour from the pickup_datetime column using the hour() API.
4. Weekday / weekend / holiday indicator: we will derive a factor variable (weekday / weekend) from the pickup_datetime column. "T" represents weekdays, "F" represents weekends.
# Build pickup / drop-off coordinate matrices (lon, lat) by column NAME
# rather than by position: the original indexed allDS[, 6:9], which
# silently breaks if the CSV column order ever changes.
pickups <- cbind(allDS$pickup_longitude, allDS$pickup_latitude)
dropoffs <- cbind(allDS$dropoff_longitude, allDS$dropoff_latitude)
# Geodesic pickup -> dropoff distance in meters (geosphere::distGeo).
allDS$trip_distance <- distGeo(pickups, dropoffs)
summary(allDS$trip_distance)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 1232 2094 3440 3877 1240510
# Initial bearing (degrees) and haversine (great-circle) distance.
allDS$trip_bearing <- bearing(pickups, dropoffs)
allDS$trip_haversine <- distHaversine(pickups, dropoffs)
# Parse the timestamp columns and derive the time-based features used by
# the models: pickup hour, weekday/weekend flag ("F" = weekend pickup,
# "T" = weekday pickup), calendar date, trip duration recomputed from
# the timestamps (seconds), month, and abbreviated weekday name.
allDS <- allDS %>%
  filter(!is.na(pickup_datetime)) %>%
  mutate(
    pickup_datetime  = as.POSIXct(strptime(pickup_datetime, format="%Y-%m-%d %H:%M:%S")),
    dropoff_datetime = as.POSIXct(strptime(dropoff_datetime, format="%Y-%m-%d %H:%M:%S")),
    trip_hour = hour(pickup_datetime),
    trip_wkday_ind =
      ifelse(weekdays(pickup_datetime) %in% c("Saturday", "Sunday"),
             "F", "T"),
    trip_date = date(pickup_datetime),
    trip_duration_computed = as.numeric(difftime(dropoff_datetime, pickup_datetime, units="secs")),
    trip_month = month(trip_date),
    trip_wkday = weekdays(trip_date, abbreviate=TRUE)
  )
# Distribution of the raw trip duration; the NA's are the test rows.
summary(allDS$trip_duration)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.0 397.0 662.0 952.8 1075.0 86392.0 625134
summary(allDS$trip_hour)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 9.00 14.00 13.61 19.00 23.00 1
summary(allDS$trip_wkday_ind)
## Length Class Mode
## 2083774 character character
# NOTE(review): allDS$trip_speed is never created anywhere in this
# report, so this summary is NULL - likely a leftover from an earlier
# draft; confirm and remove.
summary(allDS$trip_speed)
## Length Class Mode
## 0 NULL NULL
# Fix the RNG seed so the k-means cluster assignments are reproducible.
set.seed(7553)
# Cluster pickup and dropoff coordinates into 10 zones each; the cluster
# ids become categorical location features for the models below.
pickupKMS <- kmeans(cbind(allDS$pickup_longitude, allDS$pickup_latitude), centers=10)
dropKMS <- kmeans(cbind(allDS$dropoff_longitude, allDS$dropoff_latitude), centers=10)
allDS$trip_pickup_cluster <-pickupKMS$cluster
allDS$trip_dropoff_cluster <- dropKMS$cluster
# Build small (lng, lat, size) data frames of cluster centers for maps.
pickupCenters <- as.data.frame(as.matrix(pickupKMS$centers, ncol=2))
pickupCenters$cnt <- pickupKMS$size
names(pickupCenters) <- c("lng", "lat", "cnt")
dropoffCenters <- as.data.frame(as.matrix(dropKMS$centers, ncol=2))
dropoffCenters$cnt <- dropKMS$size
names(dropoffCenters) <- c("lng", "lat", "cnt")
# Interactive leaflet map of the pickup cluster centers, centered on
# Manhattan; marker radius reflects the cluster size.
leaflet(pickupCenters) %>%
addTiles() %>%
setView(lng=-73.99234, lat=40.74488, zoom=10) %>%
addCircleMarkers(lng=~lng, lat=~lat,
popup=as.character(pickupCenters$cnt),
radius=~cnt,
clusterOptions = markerClusterOptions())
### Traffic aggregation for month and weekday. Some months might have snowfall, resulting in longer trip times, so we will add the month as a feature as well.
# Mean trip duration per pickup month (observed rows only).
# summarise_at()/vars() is superseded in modern dplyr; a plain named
# summarise() produces the identical result and is clearer.
aggrMonthDS <- allDS %>%
  filter(!is.na(trip_duration)) %>%
  group_by(trip_month) %>%
  summarise(trip_duration = mean(trip_duration))
plot_ly(aggrMonthDS, x=~trip_month, y=~trip_duration, type="scatter", mode="markers+lines")
Additionally, we also categorize the trip_hour and found that the speed is drastically reduced during rush hours (8 am to 7 pm), while the speed seems to be highest around midnight (hours 1 to 4).
# Mean trip duration per pickup hour; durations rise sharply during the
# daytime rush and are lowest overnight.  summarise_at() is superseded
# in modern dplyr; plain named summarise() is equivalent.
aggrHourDS <- allDS %>%
  filter(!is.na(trip_duration)) %>%
  group_by(trip_hour) %>%
  summarise(trip_duration = mean(trip_duration))
hourPlot <- ggplot(data=aggrHourDS, aes(x=trip_hour, y=trip_duration)) +
  geom_point() +
  geom_line()
# Mean trip duration per (abbreviated) weekday name.
weekdayDS <- allDS %>%
  filter(!is.na(trip_duration)) %>%
  group_by(trip_wkday) %>%
  summarise(trip_duration = mean(trip_duration))
weekdayPlot <- ggplot(data=weekdayDS, aes(x=trip_wkday, y=trip_duration))+
  geom_point() +
  geom_line()
We will subset 70% of data as training data, 30% of data as cross-validation data.
# Seed for a reproducible 70/30 train / cross-validation split.
set.seed(3456)
# Only rows with observed durations can be used for supervised fitting.
trainDS <-
allDS %>%
filter(!is.na(trip_duration)) %>%
filter(!is.na(trip_duration_computed))
# caret::createDataPartition stratifies the split on trip_distance.
trainIndex <- createDataPartition(trainDS$trip_distance, p = .70,
list = FALSE,
times = 1)
tripTrain <- trainDS[trainIndex, ]
tripCV <- trainDS[-trainIndex,]
dim(tripTrain)
## [1] 1021048 22
dim(tripCV)
## [1] 437590 22
We fit a linear regression model first and check the residuals. It looks like there is a pattern for trip_distance between 0 and 50 km: the points seem to split into two groups, some in the high range (residuals around 20-25) and some in the lower range (residuals near 0). It looks like we might be missing some variables or need to try a different model.
# Baseline linear model: timestamp-derived trip duration as a function
# of trip distance plus categorical time and pickup-location effects.
# lm() includes an intercept by default, so the original trailing "+ 1"
# was redundant and has been dropped - the fitted model is identical.
modelFit <- lm(data=tripTrain, trip_duration_computed ~ trip_distance +
                 factor(trip_hour) +
                 factor(trip_wkday_ind) +
                 factor(trip_pickup_cluster) +
                 factor(trip_month))
# Show the fitted coefficients.
coef(modelFit)
## (Intercept) trip_distance
## 332.43017007 0.09993748
## factor(trip_hour)1 factor(trip_hour)2
## 14.86859980 14.69639827
## factor(trip_hour)3 factor(trip_hour)4
## -17.41402895 -29.68062559
## factor(trip_hour)5 factor(trip_hour)6
## -205.67117976 -174.41020993
## factor(trip_hour)7 factor(trip_hour)8
## -10.07252543 129.87479924
## factor(trip_hour)9 factor(trip_hour)10
## 154.46184330 154.29689440
## factor(trip_hour)11 factor(trip_hour)12
## 168.68456853 198.23635206
## factor(trip_hour)13 factor(trip_hour)14
## 229.68390504 234.19023104
## factor(trip_hour)15 factor(trip_hour)16
## 276.11495587 258.02561028
## factor(trip_hour)17 factor(trip_hour)18
## 208.45064161 186.08142069
## factor(trip_hour)19 factor(trip_hour)20
## 80.59358529 42.88391821
## factor(trip_hour)21 factor(trip_hour)22
## 27.64173020 76.18822395
## factor(trip_hour)23 factor(trip_wkday_ind)T
## 24.61465678 69.13731375
## factor(trip_pickup_cluster)2 factor(trip_pickup_cluster)3
## 90.37801977 23.52116064
## factor(trip_pickup_cluster)4 factor(trip_pickup_cluster)5
## 333.48277550 26.04314924
## factor(trip_pickup_cluster)6 factor(trip_pickup_cluster)7
## 66.16152220 101.63280849
## factor(trip_pickup_cluster)8 factor(trip_pickup_cluster)9
## 82.02865474 -42.43361252
## factor(trip_pickup_cluster)10 factor(trip_month)2
## 131.18284524 -3.29422869
## factor(trip_month)3 factor(trip_month)4
## 23.06152272 57.09386229
## factor(trip_month)5 factor(trip_month)6
## 79.64898328 86.31084174
# ANOVA table: every predictor group in the model is highly significant.
anova(modelFit)
## Analysis of Variance Table
##
## Response: trip_duration_computed
## Df Sum Sq Mean Sq F value
## trip_distance 1 2.2291e+11 2.2291e+11 22505.186
## factor(trip_hour) 23 1.1101e+10 4.8266e+08 48.730
## factor(trip_wkday_ind) 1 9.4628e+08 9.4628e+08 95.539
## factor(trip_pickup_cluster) 9 4.7864e+09 5.3183e+08 53.695
## factor(trip_month) 5 1.2961e+09 2.5922e+08 26.171
## Residuals 1021008 1.0113e+13 9.9047e+06
## Pr(>F)
## trip_distance < 2.2e-16 ***
## factor(trip_hour) < 2.2e-16 ***
## factor(trip_wkday_ind) < 2.2e-16 ***
## factor(trip_pickup_cluster) < 2.2e-16 ***
## factor(trip_month) < 2.2e-16 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Score the linear model on the cross-validation split.
predictData <- predict(modelFit, newdata=tripCV)
predictData <- as.data.frame(predictData)
# Compare the MEANS of predicted vs observed durations.  This is only a
# sanity check: a t-test on means says nothing about per-trip accuracy.
t.test(predictData, tripCV$trip_duration_computed)
##
## Welch Two Sample t-test
##
## data: predictData and tripCV$trip_duration_computed
## t = 0.57382, df = 455340, p-value = 0.5661
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -6.636582 12.131181
## sample estimates:
## mean of x mean of y
## 953.1921 950.4448
# RMSLE-style error of the CV predictions.
# NOTE(review): this uses log2 rather than the natural log of the
# conventional RMSLE definition - confirm which metric is intended.
rmseInidialModel <- sqrt(1/nrow(tripCV) *
(sum((log2(predictData[,1]+1) - log2(tripCV$trip_duration_computed+1))^2)))
# Residuals of the model on its own training data.
residualsCV <- resid(modelFit)
# NOTE(review): the x axis here is trip_duration although the
# surrounding text discusses a pattern in trip_distance - verify.
plot(x=tripTrain$trip_duration, residualsCV)
We will try splitting the data between rows with higher residuals and rows with lower residuals and run a different model on each. We have 1894 rows with very large residuals. To understand what distinguishes these rows, we will run a classification model to categorize the residual quartiles.
# Inspect the residual quartiles of the linear model.
resid_quantiles <- quantile(modelFit$residuals)
resid_quantiles
## 0% 25% 50% 75% 100%
## -123803.1451 -369.8356 -194.2295 46.6426 85958.9993
# Assign each training row a quartile rank (1-4) by where its residual
# falls between consecutive quartile boundaries.  Fixes vs the original:
# quantile() was recomputed on every loop iteration (hoisted above), and
# the partially-matched `modelFit$resid` is spelled out as $residuals.
# Note: the lower bound is strict (>), so the single minimum-residual
# row keeps the default rank of 1 - behavior preserved from the original.
tripTrain$quantileRank <- 1
for (i in 1:4)
{
  curRowIndex <- which(modelFit$residuals <= resid_quantiles[[i + 1]] &
                         modelFit$residuals > resid_quantiles[[i]])
  print(paste("Current range:",
              as.character(resid_quantiles[[i + 1]]),
              as.character(resid_quantiles[[i]])))
  print(length(curRowIndex))
  tripTrain[curRowIndex, "quantileRank"] <- i
}
## [1] "Current range: -369.83559334701 -123803.145099442"
## [1] 255261
## [1] "Current range: -194.229529016227 -369.83559334701"
## [1] 255262
## [1] "Current range: 46.6426028667614 -194.229529016227"
## [1] 255262
## [1] "Current range: 85958.9993074381 46.6426028667614"
## [1] 255262
# Drop objects no longer needed before the classification step.
rm(list="predictData")
rm(modelFit)
summary(tripTrain$quantileRank)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 1.75 2.50 2.50 3.25 4.00
From the above model, we can see that some categories of data are having high residuals. We run the rpart classification model to find what kind of categories will result in such high residuals.
# Classify the residual quartile rank from the same predictors used by
# the linear model, to see which segments drive the large residuals.
# method="class" treats quantileRank (1-4) as a categorical outcome.
# NOTE(review): trip_hour uses as.factor() while other terms use
# factor() - equivalent results, but inconsistent styling.
results_model <- rpart(data=tripTrain,
quantileRank ~
trip_distance +
as.factor(trip_hour) +
factor(trip_wkday) +
factor(trip_pickup_cluster) +
factor(trip_month),
method="class")
print(results_model)
## n= 1021048
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 1021048 765786 1 (0.25000000 0.25000000 0.25000000 0.25000000)
## 2) trip_distance< 1707.387 408395 242636 1 (0.40587911 0.32607647 0.18769084 0.08035358)
## 4) as.factor(trip_hour)=8,9,10,11,12,13,14,15,16,17,18,22 262584 132858 1 (0.49403619 0.26835984 0.15174192 0.08586205) *
## 5) as.factor(trip_hour)=0,1,2,3,4,5,6,7,19,20,21,23 145811 83110 2 (0.24712127 0.43001557 0.25242951 0.07043364) *
## 3) trip_distance>=1707.387 612653 390207 4 (0.14609085 0.19928736 0.29153534 0.36308645)
## 6) trip_distance< 3176.34 287969 189316 3 (0.13683417 0.26247617 0.34258201 0.25810764) *
## 7) trip_distance>=3176.34 324684 176565 4 (0.15430080 0.14324389 0.24626098 0.45619433) *
It looks like the rush hour and trip distance have some effect on the residuals, so we will form a categorized trip_distance variable as a new feature.
# Feature engineering shared by the train / CV / test splits:
#  * trip_rush_hour    - "R" for pickups in hours 8-18, otherwise "N"
#  * trip_distance_cat - trip distance bucketed into DIST1/DIST2/DIST3
#  * trip_cluster_cat  - "<pickup cluster>_<dropoff cluster>" pair key
# Returns the input data frame with the three new columns appended.
add_rushhour_ind <- function(tripTrain)
{
  tripTrain <- tripTrain %>%
    mutate(trip_rush_hour = factor(ifelse(trip_hour %in% c(8:18), "R", "N"))) %>%
    mutate(trip_distance_cat =
             factor(case_when(
               trip_distance < 1681.444 ~ 'DIST1',
               trip_distance >= 1681.444 & trip_distance < 3025.273 ~ 'DIST2',
               trip_distance >= 3025.273 ~ 'DIST3'))) %>%
    # paste0() replaces paste(..., sep="") - identical result, idiomatic
    mutate(trip_cluster_cat =
             paste0(as.character(trip_pickup_cluster),
                    "_",
                    as.character(trip_dropoff_cluster)))
  tripTrain
}
# Apply the engineered features to both the training and CV splits.
tripTrain <- add_rushhour_ind(tripTrain)
tripCV <- add_rushhour_ind(tripCV)
We are wondering if short-distance rides will have a different outcome than long-distance rides, since on short downtown rides the speed is usually much slower than on long-distance rides. We will subset the data according to trip_distance. When plotting the predicted vs. observed duration, we found 219 rows with very large residuals. Looking at the data individually, we can see that the trip_duration is extremely unreasonable (nearly 24 hours for a 5 km ride). After dropping the outliers, the error dropped significantly from 0.9 to 0.65.
# Split train (s*) and CV (v*) by distance bucket so each bucket gets
# its own model.
s1 <- subset(tripTrain, trip_distance_cat == 'DIST1')
s2 <- subset(tripTrain, trip_distance_cat == 'DIST2')
s3 <- subset(tripTrain, trip_distance_cat == 'DIST3')
v1 <- subset(tripCV, trip_distance_cat == 'DIST1')
v2 <- subset(tripCV, trip_distance_cat == 'DIST2')
v3 <- subset(tripCV, trip_distance_cat == 'DIST3')
# Build an xgb.DMatrix from a data frame: a sparse model matrix of the
# predictors (trip distance, rush-hour flag, pickup/dropoff cluster
# pair) labelled with the observed trip duration.
make_xgb_matrix <- function(d)
{
  design <- sparse.model.matrix(trip_duration ~
                                  trip_distance +
                                  trip_rush_hour +
                                  factor(trip_cluster_cat), data = d)
  xgb.DMatrix(data = design, label = d$trip_duration)
}
# Fit an xgboost regression for one distance bucket and return the
# predictions for the hold-out set.
#   s - training subset, v - validation/test subset
# Steps: 7-fold xgb.cv with early stopping to pick the best round
# count, then a final xgb.train at that count, then predict on v.
# Fixes vs the original:
#  * removed an initial xgboost() fit whose result was immediately
#    overwritten by the xgb.cv result (pure wasted computation), and
#  * dropped the redundant label= argument - the label already lives
#    inside the xgb.DMatrix, so passing it again only produced
#    "label will be ignored" warnings.
make_xgb_model_prediction <- function(s, v)
{
  print("Boosting..")
  sparse_matrix_s <- make_xgb_matrix(s)
  output_vector <- s$trip_duration
  foldsCV <- createFolds(output_vector, k=7, list=TRUE, returnTrain=FALSE)
  param <- list(colsample_bytree = 0.7
                , booster = "gbtree"
                , objective = "reg:linear"
                , subsample = 0.7
                , max_depth = 5
                , eta = 0.037
                , eval_metric = 'rmse'
                , base_score = 0.012 #average
                , seed = 4321)
  # Cross-validate to find the best number of boosting rounds.
  bst <- xgb.cv(data=sparse_matrix_s,
                params=param,
                nrounds = 30,
                folds=foldsCV,
                prediction=TRUE, nthread = 2,
                early_stopping_rounds = 15, print_every_n = 5)
  nrounds <- bst$best_iteration
  print("training the xgb...")
  # Final fit on the full bucket at the CV-selected round count.
  xgb <- xgb.train(params = param
                   , data = sparse_matrix_s
                   , nrounds = nrounds
                   , verbose = 1
                   , print_every_n = 5
                   #, feval = amm_mae
  )
  sparse_matrix_v <- make_xgb_matrix(v)
  print("Predict using the xgb boosting model...")
  predictedxgb <- predict(xgb, sparse_matrix_v)
  predictedxgb
}
# Fit a plain linear model for one distance bucket and predict on the
# hold-out set.  Predictors: trip distance, rush-hour flag and pickup
# cluster.  Returns the vector of predicted durations for v.
make_linear_model <- function(s, v)
{
  print("Linear model ...")
  fit <- lm(trip_duration ~
              trip_distance +
              trip_rush_hour +
              factor(trip_pickup_cluster) +
              1,
            data = s)
  print("Prediction using the linear model ...")
  predict(fit, newdata = v)
}
# For one distance bucket, fit both the xgboost model and the linear
# model, score each on the SAME validation set with RMSLE, and report
# which one wins.
# Bug fix: the xgb score was computed against v1$trip_duration (the
# DIST1 validation set) regardless of which bucket was passed in; for
# the DIST2/DIST3 buckets this produced the "longer object length is
# not a multiple of shorter object length" warnings seen in the output
# and made their xgb RMSLE meaningless.  It now scores against
# v$trip_duration, exactly like the linear model branch.
distance_model <- function(s, v)
{
  predictedxgb <- make_xgb_model_prediction(s, v)
  xgbRmse <- rmsle(predictedxgb, v$trip_duration)
  predictedlinear <- make_linear_model(s, v)
  linearRmse <- rmsle(predictedlinear, v$trip_duration)
  print(paste("rmse pair:", as.character(xgbRmse), as.character(linearRmse)))
  if (xgbRmse < linearRmse)
    print("Choose xgb model!")
  else
    print("Choose linear model!")
}
# Evaluate both models on the short-distance bucket (DIST1).
distance_model(s1, v1)
## [1] "Boosting.."
## Warning in xgb.get.DMatrix(data, label, missing, weight): xgboost: label
## will be ignored.
## [1] train-rmse:2973.632812
## [2] train-rmse:2968.411133
## [3] train-rmse:2962.264648
## [4] train-rmse:2957.330322
## [5] train-rmse:2950.386719
## [6] train-rmse:2946.393311
## [7] train-rmse:2943.231934
## [8] train-rmse:2939.215820
## [9] train-rmse:2934.478516
## [10] train-rmse:2931.830078
## Warning in xgb.get.DMatrix(data, label, missing): xgboost: label will be
## ignored.
## [1] train-rmse:3024.354074+17.627287 test-rmse:3023.182408+104.433288
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 15 rounds.
##
## [6] train-rmse:3007.653216+17.772939 test-rmse:3009.235526+104.685749
## [11] train-rmse:2995.541120+17.714109 test-rmse:2999.589983+104.852848
## [16] train-rmse:2986.372524+17.557126 test-rmse:2993.011300+104.855324
## [21] train-rmse:2979.674247+17.590784 test-rmse:2988.489153+104.829320
## [26] train-rmse:2974.069894+17.664203 test-rmse:2985.397949+104.818495
## [30] train-rmse:2970.563023+17.626259 test-rmse:2983.671491+104.702497
## [1] "training the xgb..."
## [1] "Predict using the xgb boosting model..."
## [1] "Linear model ..."
## [1] "Prediction using the linear model ..."
## [1] "rmse pair: 0.653423716189369 0.735426318259544"
## [1] "Choose xgb model!"
# Evaluate both models on the medium-distance bucket (DIST2).
distance_model(s2, v2)
## [1] "Boosting.."
## Warning in xgb.get.DMatrix(data, label, missing, weight): xgboost: label
## will be ignored.
## [1] train-rmse:3070.697266
## [2] train-rmse:3062.504395
## [3] train-rmse:3056.660889
## [4] train-rmse:3049.972412
## [5] train-rmse:3039.898926
## [6] train-rmse:3033.347412
## [7] train-rmse:3026.932861
## [8] train-rmse:3023.590332
## [9] train-rmse:3016.962158
## [10] train-rmse:3009.131836
## Warning in xgb.get.DMatrix(data, label, missing): xgboost: label will be
## ignored.
## [1] train-rmse:3184.566476+37.267432 test-rmse:3177.803502+225.003176
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 15 rounds.
##
## [6] train-rmse:3148.030657+37.378983 test-rmse:3144.575405+226.805373
## [11] train-rmse:3121.661935+37.125142 test-rmse:3121.383266+227.698101
## [16] train-rmse:3102.651577+36.811311 test-rmse:3105.427176+228.369222
## [21] train-rmse:3088.800886+37.321075 test-rmse:3094.421631+228.726828
## [26] train-rmse:3077.820905+37.607019 test-rmse:3086.865165+228.907412
## [30] train-rmse:3071.346715+36.919436 test-rmse:3082.543666+229.012360
## [1] "training the xgb..."
## [1] "Predict using the xgb boosting model..."
## Warning in log(1 + actual) - log(1 + predicted): longer object length is
## not a multiple of shorter object length
## [1] "Linear model ..."
## [1] "Prediction using the linear model ..."
## [1] "rmse pair: 0.859292823314624 0.471164060155784"
## [1] "Choose linear model!"
# Evaluate both models on the long-distance bucket (DIST3).
distance_model(s3, v3)
## [1] "Boosting.."
## Warning in xgb.get.DMatrix(data, label, missing, weight): xgboost: label
## will be ignored.
## [1] train-rmse:3351.978027
## [2] train-rmse:3343.138916
## [3] train-rmse:3336.958008
## [4] train-rmse:3331.784912
## [5] train-rmse:3329.258057
## [6] train-rmse:3325.815674
## [7] train-rmse:3322.960693
## [8] train-rmse:3319.055420
## [9] train-rmse:3311.994629
## [10] train-rmse:3310.536377
## Warning in xgb.get.DMatrix(data, label, missing): xgboost: label will be
## ignored.
## [1] train-rmse:3705.926862+34.223443 test-rmse:3701.259835+199.970483
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 15 rounds.
##
## [6] train-rmse:3598.724784+35.166956 test-rmse:3596.225412+204.541969
## [11] train-rmse:3522.176723+35.544509 test-rmse:3521.843715+207.976510
## [16] train-rmse:3467.232352+35.213898 test-rmse:3469.152623+210.582127
## [21] train-rmse:3429.229946+35.859256 test-rmse:3432.812639+211.817860
## [26] train-rmse:3400.504046+35.850150 test-rmse:3406.551060+212.858274
## [30] train-rmse:3384.608608+35.581857 test-rmse:3391.860317+213.521667
## [1] "training the xgb..."
## [1] "Predict using the xgb boosting model..."
## Warning in log(1 + actual) - log(1 + predicted): longer object length is
## not a multiple of shorter object length
## [1] "Linear model ..."
## [1] "Prediction using the linear model ..."
## [1] "rmse pair: 1.28805539066575 0.436587571626313"
## [1] "Choose linear model!"
# Free the per-bucket training subsets and reclaim memory.
rm(s1)
rm(s2)
rm(s3)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 5684327 303.6 12002346 641.0 9565989 510.9
## Vcells 129182633 985.6 290942751 2219.8 290933937 2219.7
For the test data, we will categorize the same DIST interval and choose xgb model for DIST1 (short distance), while using the linear model for DIST2 and DIST3.
# Engineer the same features on the full dataset, then split it back:
# test rows are the ones with no recorded dropoff.
allDS <- add_rushhour_ind(allDS)
testDS <- allDS %>%
filter(is.na(dropoff_datetime))
trainDS <- allDS %>%
filter(!is.na(dropoff_datetime))
# Produce final predictions for the test set, bucket by bucket:
#  * DIST1 (short trips) -> xgboost model
#  * DIST2 / DIST3       -> linear model
# Returns a two-column data frame (id, predicted trip duration).
# NOTE(review): the training data comes from the global `trainDS`, not
# from an argument - kept as-is to preserve the existing call site.
# Fixes vs the original:
#  * per-bucket results are collected in a list and bound once at the
#    end instead of growing a data frame with rbind() inside the loop,
#  * each result is built with data.frame(id, prediction) instead of
#    as.data.frame(cbind(...)), which coerced the numeric predictions
#    to character.
final_model <- function(testDS)
{
  predictionList <- list()
  resultList <- list()
  for (i in 1:3)
  {
    print("A New iteration ")
    bucket <- paste('DIST', as.character(i), sep="")
    trainPart <- subset(trainDS, trip_distance_cat == bucket)
    testPart <- subset(testDS, trip_distance_cat == bucket)
    ## reset to 0 to avoid error
    testPart$trip_duration <- 0
    if (i == 1)
    {
      predictionList[[i]] <- make_xgb_model_prediction(trainPart, testPart)
    }
    else
    {
      predictionList[[i]] <- make_linear_model(trainPart, testPart)
    }
    # Construct an id -> prediction data frame for this bucket
    resultList[[i]] <- data.frame(id = as.character(testPart$id),
                                  prediction = predictionList[[i]],
                                  stringsAsFactors = FALSE)
  }
  do.call(rbind, resultList)
}
# Generate the final test-set predictions.
final_prediction <- final_model(testDS)
## [1] "A New iteration "
## [1] "Boosting.."
## Warning in xgb.get.DMatrix(data, label, missing, weight): xgboost: label
## will be ignored.
## [1] train-rmse:2962.546143
## [2] train-rmse:2957.165527
## [3] train-rmse:2952.653320
## [4] train-rmse:2949.586670
## [5] train-rmse:2946.208252
## [6] train-rmse:2942.610107
## [7] train-rmse:2940.572510
## [8] train-rmse:2937.297363
## [9] train-rmse:2934.045166
## [10] train-rmse:2930.404053
## Warning in xgb.get.DMatrix(data, label, missing): xgboost: label will be
## ignored.
## [1] train-rmse:3011.734619+19.474785 test-rmse:3009.781529+119.746790
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 15 rounds.
##
## [6] train-rmse:2996.010533+19.727392 test-rmse:2995.719343+120.070375
## [11] train-rmse:2984.668318+19.386891 test-rmse:2985.990723+120.330875
## [16] train-rmse:2976.071568+19.261705 test-rmse:2979.304687+120.452704
## [21] train-rmse:2970.003104+19.133495 test-rmse:2974.709996+120.498966
## [26] train-rmse:2965.334752+19.141855 test-rmse:2971.498675+120.506581
## [30] train-rmse:2962.165911+18.954979 test-rmse:2969.670585+120.482912
## [1] "training the xgb..."
## [1] "Predict using the xgb boosting model..."
## [1] "A New iteration "
## [1] "Linear model ..."
## [1] "Prediction using the linear model ..."
## [1] "A New iteration "
## [1] "Linear model ..."
## [1] "Prediction using the linear model ..."
# Write the Kaggle submission file with the expected header row.
write.table(final_prediction, file="prediction_trip_duration.csv",sep=",",
col.names=c("id", "trip_duration"), row.names=FALSE,quote=FALSE)